#!/usr/bin/env python

import pandas as pd
import pyfastx
import sys
import argparse
import os


# Command-line interface: one or more fasta files (positional), an optional
# output tsv path, and an optional flag to transpose the written tsv.
# The description is assembled from adjacent string literals so no stray runs of
# indentation whitespace end up inside the text.
parser = argparse.ArgumentParser(
    description=(
        'This script outputs general summary stats for an assembly provided '
        'in fasta format. If given an output file, writes out a tsv, otherwise '
        'prints to the screen. "Ambiguous characters" reports total counts '
        'of any letter that is not "A", "T", "C", or "G". For version info, run '
        '`bit-version`.'
    )
)

required = parser.add_argument_group('required arguments')

required.add_argument(
    "input_assembly",
    metavar='fasta_file',
    type=str,
    nargs="+",
    help="Input assembly file(s).",
)

# default of False (rather than None) is kept so the later truthiness check on
# args.output_tsv keeps meaning "no output file requested"
parser.add_argument(
    "-o", "--output-tsv",
    help='Name of output tsv file (if none provided, prints to screen)',
    action="store",
    dest="output_tsv",
    default=False,
)

parser.add_argument(
    "-t", "--transpose-output-tsv",
    help='Set this flag if we want to have the output table have genomes as rows rather than columns.',
    action="store_true",
)

# running with no arguments at all prints the help menu (to stderr) and exits cleanly
if len(sys.argv) == 1:
    parser.print_help(sys.stderr)
    sys.exit(0)

args = parser.parse_args()

## setting up master dataframe
# one column per input assembly, labelled with the file's basename minus
# whatever follows its last period
df_colnames = [os.path.basename(assembly).rsplit(".", 1)[0] for assembly in args.input_assembly]

# Every assembly is addressed by its column label when results are filled in,
# so labels must be unique (e.g., "a.fa" and "a.fasta" would both become "a").
# Checking here gives a clear error instead of depending on how pandas reacts
# to non-unique column labels.
duplicate_names = sorted({name for name in df_colnames if df_colnames.count(name) > 1})
if duplicate_names:
    print("  The input assemblies don't have unique names: " + ", ".join(duplicate_names), file=sys.stderr)
    print("  As written, this cuts off the extension based on last period to generate names.", file=sys.stderr)
    sys.exit(1)

# row labels of the summary table, in output order
df_index = [
    "Assembly",
    "Total contigs",
    "Total length",
    "Ambiguous characters",
    "GC content",
    "Maximum contig length",
    "Minimum contig length",
    "N50",
    "N75",
    "N90",
    "L50",
    "L75",
    "L90",
    "Num. contigs >= 100",
    "Num. contigs >= 500",
    "Num. contigs >= 1000",
    "Num. contigs >= 5000",
    "Num. contigs >= 10000",
    "Num. contigs >= 50000",
    "Num. contigs >= 100000",
]

# starts out empty; cells that never get filled are written as "NA" in the tsv
df = pd.DataFrame(columns = df_colnames, index = df_index)

# Fill in one column of the summary table per input assembly.
for assembly in args.input_assembly:

    # column label: file basename with everything after the last period dropped
    label = os.path.basename(assembly).rsplit(".", 1)[0]

    try:
        df.at["Assembly", label] = label
    except AttributeError:
        print("  An attribute exception was thrown by pandas. Maybe the inputs don't have unique names?")
        print("  As written, this cuts off the extension based on last period to generate names.")
        sys.exit(1)

    # a zero-byte file (which can happen if an assembly produced no contigs) keeps
    # its column in the table, but the remaining rows are left unset
    # (written out as "NA" in the tsv)
    if os.stat(assembly).st_size == 0:
        continue

    fa = pyfastx.Fasta(assembly)

    # anything in the composition table other than uppercase A/T/G/C counts as ambiguous
    composition = fa.composition
    ambiguous_total = sum(composition[base] for base in composition if base not in ("A", "T", "G", "C"))

    general_stats = [
        ("Total contigs", len(fa)),
        ("Total length", fa.size),
        ("Ambiguous characters", ambiguous_total),
        ("GC content", round(fa.gc_content, 2)),
        ("Maximum contig length", len(fa.longest)),
        ("Minimum contig length", len(fa.shortest)),
    ]
    for row_name, value in general_stats:
        df.at[row_name, label] = value

    # fa.nl(x) is indexed as [0] -> Nx value, [1] -> Lx value
    for pct in (50, 75, 90):
        nl_info = fa.nl(pct)
        df.at["N" + str(pct), label] = nl_info[0]
        df.at["L" + str(pct), label] = nl_info[1]

    # number of contigs at or above each length cutoff
    for min_length in (100, 500, 1000, 5000, 10000, 50000, 100000):
        df.at["Num. contigs >= " + str(min_length), label] = fa.count(min_length)

    # removing the intermediate index file left next to the input
    os.remove(assembly + ".fxi")


# Report the table: to the screen when no output file was given, otherwise to a tsv.
if not args.output_tsv:
    # screen output is always genomes-as-columns, padded with a blank line on each side
    print("")
    print(df.to_string(header=False))
    print("")

elif args.transpose_output_tsv:
    # genomes as rows: after transposing, the stat names form the header line and
    # the "Assembly" column already carries each name, so the index is dropped
    df = df.T
    df.to_csv(args.output_tsv, sep="\t", index=False, na_rep="NA")

else:
    # genomes as columns: the "Assembly" row already serves as the header line
    df.to_csv(args.output_tsv, sep="\t", header=False, na_rep="NA")
